import argparse
import sys
import os
import re
from collections import Counter
from timeit import default_timer as timer

# Command-line interface.  Help strings are user-facing (Chinese) runtime
# text and are left as-is; English meanings are noted in comments.
parser = argparse.ArgumentParser()

# -c FILE : report per-letter frequency of FILE
parser.add_argument('-c', '--letter', help="统计字母出现的频率")
# -f FILE : the text file to analyze
parser.add_argument('-f', '--filename', help="需要处理的文本名称", default=None)
# -n N    : number of words/phrases to display
parser.add_argument('-n', '--nWords', help="输出的单词或短语个数", type=int)
# -x FILE : stop-word list file
parser.add_argument('-x', '--stopFile', help="停词表名称", default=None)
# -v FILE : verb-inflection mapping file ("base -> form1,form2,...")
parser.add_argument('-v', '--verbFile', help="动词变化的文件名称", default=None)
# -p N    : length of common phrases to search for
parser.add_argument('-p', '--phraseLen', help="查询常用词组的长度", type=int)
# -d DIR  : analyze every .txt directly inside DIR (non-recursive)
parser.add_argument('-d', '--dirname', help="非递归遍历该目录", default=None)
# -s DIR  : analyze every .txt under DIR recursively
parser.add_argument('-s', '--dir', help="递归遍历遍历目录", default=None)


# Read text data from a file.
def get_context(filename):
    """Return the entire contents of *filename*, lower-cased.

    Exits the program (original behavior) if the file cannot be read.
    """
    try:
        # `with` guarantees the handle is closed; the original leaked it.
        with open(filename, 'r') as f:
            return f.read().lower()
    except OSError:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.  Bad filename -> abort, as before.
        sys.exit()


def sort_dict(my_dict):
    """Return the dict's (key, value) pairs as a list, ordered by
    descending value; ties are broken by ascending key order."""
    return sorted(my_dict.items(), key=lambda kv: (-kv[1], kv[0]))


# Count the frequency of each letter (supports -c).
def calculate_letter_times(filename):
    """Print the relative frequency of each ASCII lowercase letter in the
    (already lower-cased) contents of *filename*.

    Returns the elapsed wall-clock time in seconds.
    """
    start_time = timer()
    context = get_context(filename)

    letter_dict = {}
    for letter in context:
        if 'a' <= letter <= 'z':
            letter_dict[letter] = letter_dict.get(letter, 0) + 1

    total = sum(letter_dict.values())  # total number of letters seen

    # Convert raw counts to fractions.  Guard total == 0: a file with no
    # letters crashed the original with ZeroDivisionError.
    if total > 0:
        letter_dict = {k: v / total for k, v in letter_dict.items()}
    letter_list = sort_dict(letter_dict)

    # BUGFIX: the original f-string had no placeholder (it printed a
    # literal "(unknown)"); interpolate the actual filename.
    print(f"文件{filename}中的各字母出现频率如下：")
    for k, v in letter_list:
        print('{} : {:.2f}%'.format(k, v * 100))

    end_time = timer()
    spend_time = end_time - start_time
    return spend_time


#######################################################################################
# @author:xincheng-q
# time: 2022-09-15
# count_frequent_word()
# @filename:       the file whose word frequencies are reported
# @word_num:       how many words to display (0 = all); supports -n
# @stop_filename:  optional stop-word file; supports -x
# @verb_form_file: optional verb-form file that folds inflected forms into
#                  their base form ("base -> form1,form2,..."); supports -v
#######################################################################################
def count_frequent_word(filename, word_num=0, stop_filename=None, verb_form_file=None):
    """Print the most frequent words of *filename* and the time consumed."""
    start_time = timer()

    context = get_context(filename)
    # Note: '\\|' is the same two characters as the original's '\|' but
    # without the invalid-escape warning.
    punctuation = '!"#$%^&*()_-=+`~,./;:’[]{\\|<>'
    # One C-level translate pass instead of one .replace() per character.
    context = context.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    # Stop words as a set: O(1) membership instead of O(n) list scans.
    stop_words = set()
    if stop_filename is not None:
        stop_words = set(get_context(stop_filename).split())

    word_dict = {}
    for word in context.split():
        if word.isalpha() and word not in stop_words:
            word_dict[word] = word_dict.get(word, 0) + 1

    if verb_form_file is not None:
        for verb_phrase in get_context(verb_form_file).strip().split('\n'):
            # verb_phrase format: "attend -> attends,attending,attended"
            key, values = verb_phrase.split(" -> ")
            for variant in values.split(','):
                if variant in word_dict:
                    # BUGFIX: the original did `word_dict[key] += ...`, which
                    # raised KeyError whenever the base form never occurred in
                    # the text but one of its variants did.
                    word_dict[key] = word_dict.get(key, 0) + word_dict[variant]
                    del word_dict[variant]

    word_list = sort_dict(word_dict)
    if word_num == 0:
        word_num = len(word_list)
    # Slicing caps at len(word_list), so asking for more words than exist
    # is safe (same effect as the original min()).
    for k, v in word_list[:word_num]:
        print('{0:<20} : {1:>5}次'.format(k, v))

    end_time = timer()
    spend_time = end_time - start_time
    print('The time consumed is {}s \n'.format(spend_time))


def judge_file_type(filename):  # True for .txt files, False otherwise
    """Return True iff *filename* ends in a .txt extension.

    BUGFIX: the original used split('.')[1], which looked at the FIRST dot
    (so "a.b.txt" was misclassified) and returned None via a silent
    `except: pass` for extensionless names.  splitext keys on the last dot
    and this version always returns a bool.
    """
    suffix = os.path.splitext(filename)[1]
    return suffix.lstrip('.') in ['txt']


# Non-recursive directory scan (supports -d).
def traverse_dir(dir_name, word_num=0, stop_filename=None, verb_filename=None):
    """Report word frequencies for every .txt file directly inside *dir_name*."""
    if not os.path.isdir(dir_name):
        print(f"当前路径 {dir_name} 无文件")
        return
    for entry in os.listdir(dir_name):
        if not judge_file_type(entry):
            continue
        print(f"在{entry}文本中，单词出现的频率如下：")
        # listdir yields bare names; pass the full path down.
        full_path = os.path.join(dir_name, entry)
        count_frequent_word(filename=full_path, word_num=word_num,
                            stop_filename=stop_filename, verb_form_file=verb_filename)
        print("------------------分割线------------------\n")


# Recursive directory scan (supports -s).
def recursive_traverse_dir(cur_dir_name, word_num=0, stop_filename=None, verb_filename=None):
    """Report word frequencies for every .txt file anywhere under *cur_dir_name*."""
    # os.walk already visits every subdirectory, so no manual recursion is needed.
    for dirpath, _subdirs, filenames in os.walk(cur_dir_name):
        for name in filenames:
            if not judge_file_type(name):
                continue
            full_path = os.path.join(dirpath, name)
            print(f"在{full_path}文本中，单词出现的频率如下：")
            count_frequent_word(filename=full_path, word_num=word_num,
                                stop_filename=stop_filename, verb_form_file=verb_filename)
            print("------------------分割线------------------\n")


def query_phrase(query_filename, length_of_phrase, stop_filename=None):  # -p
    """Find and print phrases of *length_of_phrase* consecutive words that
    occur more than once in *query_filename*.

    Sentences are first extracted as runs of letter-only words, joined with
    commas; phrases are then collected by repeatedly stripping the leading
    word and re-scanning, so every word offset is covered.  Phrases that
    straddle a sentence boundary (contain a comma) are discarded.
    """
    context = get_context(query_filename)
    context = context.replace('\n', ' ')

    re_single_word = r'(([a-z]+ )+[a-z]+)'  # extract runs of >=2 words ("sentences")
    pattern = re.compile(re_single_word)
    sentence = pattern.findall(context)  # list of (full match, last-group) tuples
    # Join the full-match group of each tuple; ',' marks sentence boundaries.
    txt = ','.join(sentence[i][0] for i in range(len(sentence)))

    regex = "[a-z]+[0-9]*"  # matches one word
    pattern = regex
    for i in range(length_of_phrase - 1):  # chain `length_of_phrase` words together
        pattern += "[\s|,][a-z]+[0-9]*"

    # findall returns non-overlapping matches only, so shift the text left by
    # one word per pass to also capture phrases at every other word offset.
    word_list = []
    for i in range(length_of_phrase):
        if i == 0:
            temp_list = re.findall(pattern, txt)
        else:
            word_pattern = regex
            # Drop the first remaining word (count=1), then re-scan.
            txt = re.sub(word_pattern, '', txt, 1).strip()
            temp_list = re.findall(pattern, txt)
        word_list += temp_list
    temp_counter = Counter(word_list)
    dic_num = {}
    phrases = temp_counter.keys()

    stop_phrase_list = []
    if stop_filename is not None:
        stop_phrase_list = get_context(stop_filename).strip().split('\n')  # meaningless-phrase list, one per line
    # NOTE(review): total_num is accumulated but never used afterwards.
    total_num = 0
    for phrase in phrases:
        if ',' not in phrase:  # skip phrases that cross sentence boundaries
            if len(stop_phrase_list) > 0 and phrase in stop_phrase_list:
                continue
            else:
                dic_num[phrase] = temp_counter[phrase]
                total_num += temp_counter[phrase]
    dic_list = sort_dict(dic_num)
    print(f'长度为{length_of_phrase}的常用短语出现的频率如下：')
    for k, v in dic_list:
        if v > 1:  # only phrases that repeat are "common"
            print('{0:<20} : {1:>5}次'.format(k, v))


if __name__ == '__main__':
    args = parser.parse_args()
    if args.letter:
        calculate_letter_times(args.letter)
    elif args.filename:
        # BUGFIX: the original chained seven independent `if` blocks, one per
        # combination of -n/-x/-v, so e.g. "-f a.txt -n 5 -x s.txt -v v.txt"
        # printed the report several times.  count_frequent_word already
        # defaults every optional argument, so a single call covers all
        # combinations.  The word report is skipped (as before) when -p is
        # the only extra option given.
        if args.nWords or args.stopFile or args.verbFile or not args.phraseLen:
            count_frequent_word(filename=args.filename,
                                word_num=args.nWords or 0,
                                stop_filename=args.stopFile,
                                verb_form_file=args.verbFile)
        if args.phraseLen:  # -p: common-phrase query
            # NOTE(review): 'stoppharse.txt' looks like a typo for
            # 'stopphrase.txt', but the data file on disk may use the same
            # misspelled name — confirm before renaming.
            query_phrase(query_filename=args.filename,
                         length_of_phrase=args.phraseLen,
                         stop_filename='stoppharse.txt')
    elif args.dirname:
        traverse_dir(dir_name=args.dirname)
    elif args.dir:
        recursive_traverse_dir(cur_dir_name=args.dir)
